knitr::opts_chunk$set(fig.align="center") 
library(rstanarm)
library(tidyverse)
library(tidybayes)
library(modelr) 
library(ggplot2)
library(magrittr)  
library(emmeans)
library(bayesplot)
library(brms)
library(gganimate)

theme_set(theme_light())
task_list <- c("3. Prediction", "4. Exploration")
seed = 12

source('helper_functions.R')

In our experiment, we used a visualization recommendation algorithm (composed of one search algorithm and one oracle algorithm) to generate visualizations for the user on one of two datasets. We then measured the the number of exposed visual designs and variable sets, as well as the number of interacted visual designs and variable sets.

Given a search algorithm (bfs or dfs), an oracle (compassql or dziban), and a dataset (birdstrikes or movies), the number of number of visual designs and variable sets the user will be exposed to and interact with. In addition, we would like to know if the choice of search algorithm and oracle, as well as the participant’s group (student or professional) has any meaningful impact these metrics for a user.

Our weakly-informative prior (normal(35.24, 25.33)) was derived from pilot studies, and it summarizes the number of elements (either visual design or variable set) that the user will either be exposed to or interact with. Because our pilot study was small, we chose to aggregate our data (rather than deriving separate priors for each combination of exposed/interacted variable set/visual design) to minimize the effect of biases.

The lognormal family was selected for our model to prevent our model from predicting number of iexposed/interacted elements less than zero.

prior_mean = 35.24
prior_sd = 25.33

stanvars <- stanvar(prior_mean, name='prior_mean') + stanvar(prior_sd, name='prior_sd')

Number of Interacted Variable Sets

Read in and format data

interacted_var_sets_data <- read.csv("data/num_of_interacted_variable_set.csv")
interacted_var_sets_data <- interacted_var_sets_data %>%
  mutate(
    dataset = as.factor(dataset),
    oracle = as.factor(oracle),
    search = as.factor(search),
    task = as.factor(task),
    participant_group = as.factor(participant_group)
  )

Train model

model_interacted_var_sets <- brm(
    formula = bf(num_interacted_variable_set ~ oracle * search + dataset + task + participant_group + (1 | participant_id)),
    prior = prior(normal(prior_mean, prior_sd), class = "Intercept"),
    chains = 2,
    cores = 2,
    iter = 2500,
    warmup = 1000,
    data = interacted_var_sets_data,
    stanvars=stanvars,
    seed = seed,
    family = lognormal(),
    file = "models/interacted_var_sets_group"
  )

Trace plots help us check whether there is evidence of non-convergence for model.

plot(model_interacted_var_sets)

In the summary table, we want to see Rhat values close to 1.0 and Bulk_ESS in the thousands.

summary(model_interacted_var_sets)
##  Family: lognormal 
##   Links: mu = identity; sigma = identity 
## Formula: num_interacted_variable_set ~ oracle * search + dataset + task + participant_group + (1 | participant_id) 
##    Data: interacted_var_sets_data (Number of observations: 144) 
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
##          total post-warmup samples = 3000
## 
## Group-Level Effects: 
## ~participant_id (Number of levels: 72) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     0.20      0.07     0.04     0.32 1.00      421      394
## 
## Population-Level Effects: 
##                          Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept                    2.92      0.12     2.69     3.14 1.00     2309
## oracledziban                 0.01      0.12    -0.24     0.25 1.00     1747
## searchdfs                   -0.02      0.13    -0.27     0.22 1.00     1609
## datasetmovies                0.15      0.09    -0.02     0.33 1.00     3225
## task4.Exploration            0.11      0.07    -0.03     0.25 1.00     4659
## participant_groupstudent    -0.01      0.09    -0.18     0.16 1.00     2796
## oracledziban:searchdfs       0.03      0.17    -0.30     0.38 1.00     1361
##                          Tail_ESS
## Intercept                    1923
## oracledziban                 1740
## searchdfs                    1648
## datasetmovies                2004
## task4.Exploration            1923
## participant_groupstudent     2302
## oracledziban:searchdfs       1742
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma     0.42      0.04     0.36     0.49 1.00      846     1506
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).

pairs(
    model_interacted_var_sets,
    pars = c(
      "b_Intercept",
      "b_datasetmovies",
      "b_oracledziban",
      "b_searchdfs",
      "b_task4.Exploration",
      "b_participant_groupstudent"
    ),
    fixed = TRUE
  )

Visualization of parameter effects via draws from our model posterior. The thicker line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.

draw_data_interacted_var_sets <- interacted_var_sets_data %>%
  add_fitted_draws(model_interacted_var_sets, seed = seed, re_formula = NA)
draw_data_interacted_var_sets$condition <- paste(draw_data_interacted_var_sets$oracle, draw_data_interacted_var_sets$search)
plot_interacted_var_sets <- interaction_posterior_draws_plot(draw_data_interacted_var_sets, 'participant_group')
plot_interacted_var_sets

ggsave(
      file = paste("interacted_var_sets_split_group.png", sep = ""),
      plot = plot_interacted_var_sets,
      path = paste0("../plots/posterior_draws/num_interacted_variable_set")
    )
## Saving 7 x 5 in image

Since the credible intervals on our plot overlap, we can use mean_qi to get the numeric boundaries for the different intervals.

fit_info_interacted_var_sets <-
      draw_data_interacted_var_sets %>% group_by(search, oracle, task, participant_group) %>% mean_qi(.value, .width = c(.95, .5))

fit_info_interacted_var_sets
## # A tibble: 32 x 10
## # Groups:   search, oracle, task [8]
##    search oracle task  participant_gro… .value .lower .upper .width .point
##    <fct>  <fct>  <fct> <fct>             <dbl>  <dbl>  <dbl>  <dbl> <chr> 
##  1 bfs    compa… 3. P… professional       22.0   16.6   28.7   0.95 mean  
##  2 bfs    compa… 3. P… student            21.8   16.6   28.3   0.95 mean  
##  3 bfs    compa… 4. E… professional       24.6   18.7   32.0   0.95 mean  
##  4 bfs    compa… 4. E… student            24.4   18.6   31.7   0.95 mean  
##  5 bfs    dziban 3. P… professional       22.3   16.8   28.8   0.95 mean  
##  6 bfs    dziban 3. P… student            22.0   16.9   28.2   0.95 mean  
##  7 bfs    dziban 4. E… professional       24.9   18.9   32.4   0.95 mean  
##  8 bfs    dziban 4. E… student            24.6   18.8   31.6   0.95 mean  
##  9 dfs    compa… 3. P… professional       21.7   16.4   28.1   0.95 mean  
## 10 dfs    compa… 3. P… student            21.5   16.2   27.5   0.95 mean  
## # … with 22 more rows, and 1 more variable: .interval <chr>
write.csv(
      fit_info_interacted_var_sets,  "../plot_data/posterior_draws/num_interacted_variable_set/interacted_var_sets.csv",
      row.names = FALSE
    )

Differences between factors

We’d now like to see the difference in average interacted_var_sets_data between levels of search, oracle, and participant group for each task.

predictive_data_interacted_var_sets <- interacted_var_sets_data %>%
    add_fitted_draws(model_interacted_var_sets, seed = seed, re_formula = NA)

Difference in search

search_differences <- expected_diff_in_mean_plot(predictive_data_interacted_var_sets, "search", "Difference in Average Number Interacted Variable Set",  "Task", NULL)
## `summarise()` regrouping output by 'search', 'task' (override with `.groups` argument)
search_differences$plot

ggsave(
      file = "search_differences.png",
      plot = search_differences$plot,
      path = "../plots/comparisons/num_interacted_variable_set"
    )
## Saving 7 x 5 in image
write.csv(search_differences$intervals, "../plot_data/comparisons/num_interacted_variable_set/search_differences.csv", row.names = FALSE)

Let’s split by dataset.

search_difference_split_by_dataset <- expected_diff_in_mean_plot(predictive_data_interacted_var_sets, "search", "Difference in Average Number Interacted Variable Set",  "Task", "dataset")
## `summarise()` regrouping output by 'search', 'task', 'dataset' (override with `.groups` argument)
search_difference_split_by_dataset$plot

ggsave(
      file = "search_differences_split_by_dataset.png",
      plot = search_difference_split_by_dataset$plot,
      path = "../plots/comparisons/num_interacted_variable_set"
    )
## Saving 7 x 5 in image
write.csv(search_difference_split_by_dataset$intervals, "../plot_data/comparisons/num_interacted_variable_set/search_differences_split_by_dataset.csv", row.names = FALSE)

Difference in oracle

oracle_differences <- expected_diff_in_mean_plot(predictive_data_interacted_var_sets, "oracle", "Difference in Average Number Interacted Variable Set",  "Task", NULL)
## `summarise()` regrouping output by 'oracle', 'task' (override with `.groups` argument)
oracle_differences$plot

ggsave(
      file = "oracle_differences.png",
      plot = oracle_differences$plot,
      path = "../plots/comparisons/num_interacted_variable_set"
    )
## Saving 7 x 5 in image
write.csv(oracle_differences$intervals, "../plot_data/comparisons/num_interacted_variable_set/oracle_differences.csv", row.names = FALSE)

Let’s split it by dataset

oracle_differences_split_by_dataset <- expected_diff_in_mean_plot(predictive_data_interacted_var_sets, "oracle", "Difference in Average Number Interacted Variable Set","Task", "dataset")
## `summarise()` regrouping output by 'oracle', 'task', 'dataset' (override with `.groups` argument)
oracle_differences_split_by_dataset$plot

ggsave(
      file = "oracle_differences_split_by_dataset.png",
      plot = oracle_differences_split_by_dataset$plot,
      path = "../plots/comparisons/num_interacted_variable_set"
    )
## Saving 7 x 5 in image
write.csv(oracle_differences_split_by_dataset$intervals, "../plot_data/comparisons/num_interacted_variable_set/oracle_differences_split_by_dataset.csv", row.names = FALSE)

Difference in groups (Some reformatting of strings first)

predictive_data_interacted_var_sets$participant_group<- gsub('student', 'Student', predictive_data_interacted_var_sets$participant_group)
predictive_data_interacted_var_sets$participant_group<- gsub('professional', 'Professional', predictive_data_interacted_var_sets$participant_group)
participant_group_differences <- expected_diff_in_mean_plot(predictive_data_interacted_var_sets, "participant_group", "Difference in Average Number Interacted Variable Set",  "Task", NULL)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
participant_group_differences$plot

ggsave(
      file = "group_differences.png",
      plot = participant_group_differences$plot,
      path = "../plots/comparisons/num_interacted_variable_set"
    )
## Saving 7 x 5 in image
write.csv(participant_group_differences$intervals, "../plot_data/comparisons/num_interacted_variable_set/group_differences.csv", row.names = FALSE)

Let’s split by dataset

group_differences_split_by_dataset <- expected_diff_in_mean_plot(predictive_data_interacted_var_sets, "participant_group", "Difference in Average Number Interacted Variable Set",  "Task", NULL)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
group_differences_split_by_dataset$plot

ggsave(
      file = "group_differences_split_by_dataset.png",
      plot = group_differences_split_by_dataset$plot,
      path = "../plots/comparisons/num_interacted_variable_set"
    )
## Saving 7 x 5 in image
write.csv(group_differences_split_by_dataset$intervals, "../plot_data/comparisons/num_interacted_variable_set/group_differences_split_by_dataset.csv", row.names = FALSE)

Number of Interacted Visual Designs

Read in and format data

interacted_visual_design_data <- read.csv("data/num_of_interacted_visual_design.csv")
interacted_visual_design_data <- interacted_visual_design_data %>%
  mutate(
    dataset = as.factor(dataset),
    oracle = as.factor(oracle),
    search = as.factor(search),
    task = as.factor(task),
    participant_group = as.factor(participant_group)
  )

Train model

model_interacted_visual_design <- brm(
    formula = num_interacted_visual_design ~ oracle * search + dataset + task + participant_group + (1 | participant_id),
    prior = prior(normal(prior_mean, prior_sd), class = Intercept),
    chains = 2,
    cores = 2,
    iter = 2500,
    warmup = 1000,
    data = interacted_visual_design_data,
    stanvars=stanvars,
    seed = seed,
    family = lognormal(),
    file = "models/interacted_visual_design_group"
  )

Trace plots help us check whether there is evidence of non-convergence for model.

plot(model_interacted_visual_design)

In the summary table, we want to see Rhat values close to 1.0 and Bulk_ESS in the thousands.

summary(model_interacted_visual_design)
##  Family: lognormal 
##   Links: mu = identity; sigma = identity 
## Formula: num_interacted_visual_design ~ oracle * search + dataset + task + participant_group + (1 | participant_id) 
##    Data: interacted_visual_design_data (Number of observations: 144) 
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
##          total post-warmup samples = 3000
## 
## Group-Level Effects: 
## ~participant_id (Number of levels: 72) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     0.23      0.08     0.04     0.36 1.01      496      418
## 
## Population-Level Effects: 
##                          Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept                    2.98      0.12     2.75     3.20 1.00     2621
## oracledziban                 0.02      0.13    -0.25     0.28 1.00     2069
## searchdfs                   -0.08      0.13    -0.33     0.18 1.00     1946
## datasetmovies                0.17      0.09    -0.01     0.33 1.00     2922
## task4.Exploration            0.10      0.07    -0.04     0.23 1.00     6431
## participant_groupstudent    -0.01      0.09    -0.19     0.16 1.00     3227
## oracledziban:searchdfs       0.06      0.18    -0.30     0.42 1.00     1897
##                          Tail_ESS
## Intercept                    2594
## oracledziban                 1857
## searchdfs                    2162
## datasetmovies                2561
## task4.Exploration            2167
## participant_groupstudent     2478
## oracledziban:searchdfs       1939
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma     0.42      0.04     0.35     0.50 1.00      813     1368
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).

pairs(
    model_interacted_visual_design,
    pars = c(
      "b_Intercept",
      "b_datasetmovies",
      "b_oracledziban",
      "b_searchdfs",
      "b_task4.Exploration",
      "b_participant_groupstudent"
    ),
    fixed = TRUE
  )

Visualization of parameter effects via draws from our model posterior. The thicker line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.

draw_data_interacted_visual_design <- interacted_visual_design_data %>%
  add_fitted_draws(model_interacted_visual_design, seed = seed, re_formula = NA)

draw_data_interacted_visual_design$condition <- paste(draw_data_interacted_visual_design$oracle, draw_data_interacted_visual_design$search)
plot_interacted_visual_design <- interaction_posterior_draws_plot(draw_data_interacted_visual_design, 'participant_group')
plot_interacted_visual_design

ggsave(
      file = "interacted_visual_design_split_group.png",
      plot = plot_interacted_visual_design,
      path = paste0("../plots/posterior_draws/num_interacted_visual_design")
    )
## Saving 7 x 5 in image

Since the credible intervals on our plot overlap, we can use mean_qi to get the numeric boundaries for the different intervals.

fit_info_interacted_visual_design <-
      draw_data_interacted_visual_design %>% group_by(search, oracle, task, participant_group) %>% mean_qi(.value, .width = c(.95, .5))

fit_info_interacted_visual_design
## # A tibble: 32 x 10
## # Groups:   search, oracle, task [8]
##    search oracle task  participant_gro… .value .lower .upper .width .point
##    <fct>  <fct>  <fct> <fct>             <dbl>  <dbl>  <dbl>  <dbl> <chr> 
##  1 bfs    compa… 3. P… professional       23.6   17.6   30.9   0.95 mean  
##  2 bfs    compa… 3. P… student            23.3   17.6   30.3   0.95 mean  
##  3 bfs    compa… 4. E… professional       26.0   19.5   34.2   0.95 mean  
##  4 bfs    compa… 4. E… student            25.6   19.3   33.4   0.95 mean  
##  5 bfs    dziban 3. P… professional       24.0   18.0   31.5   0.95 mean  
##  6 bfs    dziban 3. P… student            23.7   18.0   30.7   0.95 mean  
##  7 bfs    dziban 4. E… professional       26.4   19.7   34.8   0.95 mean  
##  8 bfs    dziban 4. E… student            26.0   19.7   33.8   0.95 mean  
##  9 dfs    compa… 3. P… professional       21.8   16.2   28.7   0.95 mean  
## 10 dfs    compa… 3. P… student            21.5   16.3   28.0   0.95 mean  
## # … with 22 more rows, and 1 more variable: .interval <chr>
write.csv(
      fit_info_interacted_visual_design,  "../plot_data/posterior_draws/num_interacted_visual_design/interacted_visual_design.csv",
      row.names = FALSE
    )

Differences between factors

We’d now like to see the difference in average interacted_visual_design between levels of search, oracle, and participant group for each task.

predictive_data_interacted_visual_design <- interacted_visual_design_data %>%
    add_fitted_draws(model_interacted_visual_design, seed = seed, re_formula = NA)

Difference in search

search_differences <- expected_diff_in_mean_plot(predictive_data_interacted_visual_design, "search", "Difference in Average Number Interacted Visual Design",  "Task", NULL)
## `summarise()` regrouping output by 'search', 'task' (override with `.groups` argument)
search_differences$plot

ggsave(
      file = "search_differences.png",
      plot = search_differences$plot,
      path = "../plots/comparisons/num_interacted_visual_design"
    )
## Saving 7 x 5 in image
write.csv(search_differences$intervals, "../plot_data/comparisons/num_interacted_visual_design/search_differences.csv", row.names = FALSE)

Let’s split the above by dataset

search_differences_split_by_dataset <- expected_diff_in_mean_plot(predictive_data_interacted_visual_design, "search", "Difference in Average Number Interacted Visual Design",  "Task", "dataset")
## `summarise()` regrouping output by 'search', 'task', 'dataset' (override with `.groups` argument)
search_differences_split_by_dataset$plot

ggsave(
      file = "search_differences_split_by_dataset.png",
      plot = search_differences_split_by_dataset$plot,
      path = "../plots/comparisons/num_interacted_visual_design"
    )
## Saving 7 x 5 in image
write.csv(search_differences_split_by_dataset$intervals, "../plot_data/comparisons/num_interacted_visual_design/search_differences_split_by_dataset.csv", row.names = FALSE)

Difference in oracle

oracle_differences <- expected_diff_in_mean_plot(predictive_data_interacted_visual_design, "oracle", "Difference in Average Number Interacted Visual Design",  "Task", NULL)
## `summarise()` regrouping output by 'oracle', 'task' (override with `.groups` argument)
oracle_differences$plot

ggsave(
      file = "oracle_differences.png",
      plot = oracle_differences$plot,
      path = "../plots/comparisons/num_interacted_visual_design"
    )
## Saving 7 x 5 in image
write.csv(oracle_differences$intervals, "../plot_data/comparisons/num_interacted_visual_design/oracle_differences.csv", row.names = FALSE)

Let’s split the above by dataset

oracle_differences_split_by_dataset <- expected_diff_in_mean_plot(predictive_data_interacted_visual_design, "oracle", "Difference in Average Number Interacted Visual Design",  "Task", "dataset")
## `summarise()` regrouping output by 'oracle', 'task', 'dataset' (override with `.groups` argument)
oracle_differences_split_by_dataset$plot

ggsave(
      file = "oracle_differences_split_by_dataset.png",
      plot = oracle_differences_split_by_dataset$plot,
      path = "../plots/comparisons/num_interacted_visual_design"
    )
## Saving 7 x 5 in image
write.csv(oracle_differences_split_by_dataset$intervals, "../plot_data/comparisons/num_interacted_visual_design/oracle_differences_split_by_dataset.csv", row.names = FALSE)

Difference in groups (with a little string reformatting first)

predictive_data_interacted_visual_design$participant_group<- gsub('student', 'Student', predictive_data_interacted_visual_design$participant_group)
predictive_data_interacted_visual_design$participant_group<- gsub('professional', 'Professional', predictive_data_interacted_visual_design$participant_group)

participant_group_differences <- expected_diff_in_mean_plot(predictive_data_interacted_visual_design, "participant_group", "Difference in Average Number Interacted Visual Design",  "Task", NULL)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
participant_group_differences$plot

ggsave(
      file = "group_differences.png",
      plot = participant_group_differences$plot,
      path = "../plots/comparisons/num_interacted_visual_design"
    )
## Saving 7 x 5 in image
write.csv(participant_group_differences$intervals, "../plot_data/comparisons/num_interacted_visual_design/group_differences.csv", row.names = FALSE)

Let’s split the above by dataset

group_differences_split_by_dataset <- expected_diff_in_mean_plot(predictive_data_interacted_visual_design, "participant_group", "Difference in Average Number Interacted Visual Design",  "Task", "dataset")
## `summarise()` regrouping output by 'participant_group', 'task', 'dataset' (override with `.groups` argument)
group_differences_split_by_dataset$plot

ggsave(
      file = "group_differences_split_by_dataset.png",
      plot = group_differences_split_by_dataset$plot,
      path = "../plots/comparisons/num_interacted_visual_design"
    )
## Saving 7 x 5 in image
write.csv(group_differences_split_by_dataset$intervals, "../plot_data/comparisons/num_interacted_visual_design/group_differences_split_by_dataset.csv", row.names = FALSE)

Number of Exposed Variable Sets

Read in and format data

exposed_variable_set_data <- read.csv("data/num_of_exposed_variable_set.csv")
exposed_variable_set_data <- exposed_variable_set_data %>%
  mutate(
    dataset = as.factor(dataset),
    oracle = as.factor(oracle),
    search = as.factor(search),
    task = as.factor(task),
    participant_group = as.factor(participant_group)
  )

Train model

model_exposed_variable_set <- brm(
    formula = num_exposed_variable_set ~ oracle * search + dataset + task + participant_group + (1 | participant_id),
    prior = prior(normal(prior_mean, prior_sd), class = Intercept),
    chains = 2,
    cores = 2,
    iter = 2500,
    warmup = 1000,
    data = exposed_variable_set_data,
    stanvars=stanvars,
    seed = seed,
    family = lognormal(),
    file = "models/exposed_variable_set_group"
  )

Trace plots help us check whether there is evidence of non-convergence for model.

plot(model_exposed_variable_set)

In the summary table, we want to see Rhat values close to 1.0 and Bulk_ESS in the thousands.

summary(model_exposed_variable_set)
##  Family: lognormal 
##   Links: mu = identity; sigma = identity 
## Formula: num_exposed_variable_set ~ oracle * search + dataset + task + participant_group + (1 | participant_id) 
##    Data: exposed_variable_set_data (Number of observations: 144) 
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
##          total post-warmup samples = 3000
## 
## Group-Level Effects: 
## ~participant_id (Number of levels: 72) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     0.11      0.06     0.00     0.24 1.00      467      980
## 
## Population-Level Effects: 
##                          Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept                    4.01      0.10     3.81     4.21 1.00     2074
## oracledziban                 0.01      0.11    -0.22     0.23 1.00     1848
## searchdfs                   -0.35      0.11    -0.57    -0.13 1.00     1712
## datasetmovies                0.19      0.08     0.04     0.34 1.00     2819
## task4.Exploration            0.17      0.07     0.03     0.32 1.00     3791
## participant_groupstudent     0.07      0.08    -0.09     0.22 1.00     2631
## oracledziban:searchdfs       0.28      0.15    -0.01     0.57 1.00     1649
##                          Tail_ESS
## Intercept                    1998
## oracledziban                 1725
## searchdfs                    1914
## datasetmovies                2193
## task4.Exploration            2212
## participant_groupstudent     2016
## oracledziban:searchdfs       1841
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma     0.43      0.03     0.37     0.49 1.00      888     1290
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).

pairs(
    model_exposed_variable_set,
    pars = c(
      "b_Intercept",
      "b_datasetmovies",
      "b_oracledziban",
      "b_searchdfs",
      "b_task4.Exploration",
      "b_participant_groupstudent"
    ),
    fixed = TRUE
  )

Visualization of parameter effects via draws from our model posterior. The thicker line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.

draw_data_exposed_variable_set <- exposed_variable_set_data %>%
  add_fitted_draws(model_exposed_variable_set, seed = seed, re_formula = NA)
draw_data_exposed_variable_set$condition <- paste(draw_data_exposed_variable_set$oracle, draw_data_exposed_variable_set$search)
plot_exposed_variable_set <- interaction_posterior_draws_plot(draw_data_exposed_variable_set, 'participant_group')
plot_exposed_variable_set

ggsave(
      file = "exposed_variable_set_split_group.png",
      plot = plot_exposed_variable_set,
      path = "../plots/posterior_draws/num_exposed_variable_set"
    )
## Saving 7 x 5 in image

Since the credible intervals on our plot overlap, we can use mean_qi to get the numeric boundaries for the different intervals.

fit_info_exposed_variable_set <-
      draw_data_exposed_variable_set %>% group_by(search, oracle, task, participant_group) %>% mean_qi(.value, .width = c(.95, .5))

fit_info_exposed_variable_set
## # A tibble: 32 x 10
## # Groups:   search, oracle, task [8]
##    search oracle task  participant_gro… .value .lower .upper .width .point
##    <fct>  <fct>  <fct> <fct>             <dbl>  <dbl>  <dbl>  <dbl> <chr> 
##  1 bfs    compa… 3. P… professional       67.2   50.9   87.2   0.95 mean  
##  2 bfs    compa… 3. P… student            71.9   54.8   92.5   0.95 mean  
##  3 bfs    compa… 4. E… professional       79.9   60.3  104.    0.95 mean  
##  4 bfs    compa… 4. E… student            85.4   64.9  110.    0.95 mean  
##  5 bfs    dziban 3. P… professional       68.0   52.3   87.4   0.95 mean  
##  6 bfs    dziban 3. P… student            72.8   55.7   93.1   0.95 mean  
##  7 bfs    dziban 4. E… professional       80.9   61.3  105.    0.95 mean  
##  8 bfs    dziban 4. E… student            86.5   65.9  111.    0.95 mean  
##  9 dfs    compa… 3. P… professional       47.3   36.2   61.3   0.95 mean  
## 10 dfs    compa… 3. P… student            50.6   38.8   64.9   0.95 mean  
## # … with 22 more rows, and 1 more variable: .interval <chr>
write.csv(
      fit_info_exposed_variable_set,  "../plot_data/posterior_draws/num_exposed_variable_set/exposed_variable_set.csv",
      row.names = FALSE
    )

Differences between factors

We’d now like to see the difference in average accuracy between levels of search, oracle, and participant group for each task.

predictive_data_exposed_variable_set <- exposed_variable_set_data %>%
    add_fitted_draws(model_exposed_variable_set, seed = seed, re_formula = NA)

Difference in search

search_differences <- expected_diff_in_mean_plot(predictive_data_exposed_variable_set, "search", "Difference in Average Number Exposed Variable Set",  "Task", NULL)
## `summarise()` regrouping output by 'search', 'task' (override with `.groups` argument)
search_differences$plot

ggsave(
      file = "search_differences.png",
      plot = search_differences$plot,
      path = "../plots/comparisons/num_exposed_variable_set"
    )
## Saving 7 x 5 in image
write.csv(search_differences$intervals, "../plot_data/comparisons/num_exposed_variable_set/search_differences.csv", row.names = FALSE)

Let’s split the above by dataset

search_differences_split_by_dataset <- expected_diff_in_mean_plot(predictive_data_exposed_variable_set, "search", "Difference in Average Number Exposed Variable Set",  "Task", "dataset")
## `summarise()` regrouping output by 'search', 'task', 'dataset' (override with `.groups` argument)
search_differences_split_by_dataset$plot + coord_cartesian(xlim = c(-15, 60))

ggsave(
      file = "search_differences_split_by_dataset.png",
      plot = search_differences_split_by_dataset$plot + coord_cartesian(xlim = c(-15, 60)),
      path = "../plots/comparisons/num_exposed_variable_set"
    )
## Saving 7 x 5 in image
write.csv(search_differences_split_by_dataset$intervals, "../plot_data/comparisons/num_exposed_variable_set/search_differences_split_by_dataset.csv", row.names = FALSE)

Difference in oracle

oracle_differences <- expected_diff_in_mean_plot(predictive_data_exposed_variable_set, "oracle", "Difference in Average Number Exposed Variable Set",  "Task", NULL)
## `summarise()` regrouping output by 'oracle', 'task' (override with `.groups` argument)
oracle_differences$plot

ggsave(
      file = "oracle_differences.png",
      plot = oracle_differences$plot,
      path = paste0("../plots/comparisons/num_exposed_variable_set")
    )
## Saving 7 x 5 in image
write.csv(oracle_differences$intervals, "../plot_data/comparisons/num_exposed_variable_set/oracle_differences.csv", row.names = FALSE)

Let’s split the above by dataset

oracle_differences_split_by_dataset <- expected_diff_in_mean_plot(predictive_data_exposed_variable_set, "oracle", "Difference in Average Number Exposed Variable Set",  "Task", "dataset")
## `summarise()` regrouping output by 'oracle', 'task', 'dataset' (override with `.groups` argument)
oracle_differences_split_by_dataset$plot + coord_cartesian(xlim = c(-15, 60))

ggsave(
      file = "oracle_differences_split_by_dataset.png",
      plot = oracle_differences_split_by_dataset$plot + coord_cartesian(xlim = c(-15, 60)),
      path = "../plots/comparisons/num_exposed_variable_set"
    )
## Saving 7 x 5 in image
write.csv(oracle_differences_split_by_dataset$intervals, "../plot_data/comparisons/num_exposed_variable_set/oracle_differences_split_by_dataset.csv", row.names = FALSE)

Difference in groups (with some string reformatting)

predictive_data_exposed_variable_set$participant_group<- gsub('student', 'Student', predictive_data_exposed_variable_set$participant_group)
predictive_data_exposed_variable_set$participant_group<- gsub('professional', 'Professional', predictive_data_exposed_variable_set$participant_group)
predictive_data_exposed_variable_set$dataset<- gsub('birdstrikes', 'Birdstrikes', predictive_data_exposed_variable_set$dataset)
predictive_data_exposed_variable_set$dataset<- gsub('movies', 'Movies', predictive_data_exposed_variable_set$dataset)
predictive_data_exposed_variable_set$Dataset<- predictive_data_exposed_variable_set$dataset
participant_group_differences <- expected_diff_in_mean_plot(predictive_data_exposed_variable_set, "participant_group", "Difference in Average Number Exposed Variable Set",  "Task", NULL)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
participant_group_differences$plot

ggsave(
      file = "group_differences.png",
      plot = participant_group_differences$plot,
      path = "../plots/comparisons/num_exposed_variable_set"
    )
## Saving 7 x 5 in image
write.csv(participant_group_differences$intervals, "../plot_data/comparisons/num_exposed_variable_set/group_differences.csv", row.names = FALSE)

Let’s split the above by dataset

group_differences_split_by_dataset <- expected_diff_in_mean_plot(predictive_data_exposed_variable_set, "participant_group", "Difference in Average Number Exposed Variable Set",  "Task", "dataset")
## `summarise()` regrouping output by 'participant_group', 'task', 'dataset' (override with `.groups` argument)
group_differences_split_by_dataset$plot

ggsave(
      file = "group_differences_split_by_dataset.png",
      plot = group_differences_split_by_dataset$plot,
      path = "../plots/comparisons/num_exposed_variable_set"
    )
## Saving 7 x 5 in image
write.csv(group_differences_split_by_dataset$intervals, "../plot_data/comparisons/num_exposed_variable_set/group_differences_split_by_dataset.csv", row.names = FALSE)

Number of Exposed Visual Designs

Read in and format data

exposed_visual_design_data <- read.csv("data/num_of_exposed_visual_design.csv")
exposed_visual_design_data <- exposed_visual_design_data %>%
  mutate(
    dataset = as.factor(dataset),
    oracle = as.factor(oracle),
    search = as.factor(search),
    task = as.factor(task),
    participant_group = as.factor(participant_group)
  )

Train model

model_exposed_visual_design <- brm(
    formula = num_exposed_visual_design ~ oracle * search + dataset + task + participant_group + (1 | participant_id),
    prior = prior(normal(prior_mean, prior_sd), class = Intercept),
    chains = 2,
    cores = 2,
    iter = 2500,
    warmup = 1000,
    data = exposed_visual_design_data,
    stanvars=stanvars,
    seed = seed,
    family = lognormal(),
    file = "models/exposed_visual_design_group"
  )

Trace plots help us check whether there is evidence of non-convergence for model.

plot(model_exposed_visual_design)

In the summary table, we want to see Rhat values close to 1.0 and Bulk_ESS in the thousands.

summary(model_exposed_visual_design)
##  Family: lognormal 
##   Links: mu = identity; sigma = identity 
## Formula: num_exposed_visual_design ~ oracle * search + dataset + task + participant_group + (1 | participant_id) 
##    Data: exposed_visual_design_data (Number of observations: 144) 
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
##          total post-warmup samples = 3000
## 
## Group-Level Effects: 
## ~participant_id (Number of levels: 72) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     0.13      0.07     0.01     0.27 1.01      491      823
## 
## Population-Level Effects: 
##                          Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept                    4.07      0.11     3.85     4.29 1.00     1793
## oracledziban                 0.09      0.12    -0.15     0.33 1.00     1347
## searchdfs                   -0.42      0.12    -0.66    -0.18 1.00     1339
## datasetmovies                0.19      0.09     0.02     0.36 1.00     2053
## task4.Exploration            0.19      0.07     0.04     0.33 1.00     3493
## participant_groupstudent     0.07      0.08    -0.09     0.23 1.00     1843
## oracledziban:searchdfs       0.35      0.17     0.02     0.69 1.00     1076
##                          Tail_ESS
## Intercept                    1916
## oracledziban                 1630
## searchdfs                    1627
## datasetmovies                1823
## task4.Exploration            1954
## participant_groupstudent     1769
## oracledziban:searchdfs       1481
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma     0.46      0.03     0.40     0.53 1.00     1097     1178
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).

pairs(
    model_exposed_visual_design,
    pars = c(
      "b_Intercept",
      "b_datasetmovies",
      "b_oracledziban",
      "b_searchdfs",
      "b_task4.Exploration",
      "b_participant_groupstudent"
    ),
    fixed = TRUE
  )

Visualization of parameter effects via draws from our model posterior. The thicker line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.

draw_data_exposed_visual_design <- exposed_visual_design_data %>%
  add_fitted_draws(model_exposed_visual_design, seed = seed, re_formula = NA)
draw_data_exposed_visual_design$condition <- paste(draw_data_exposed_visual_design$oracle, draw_data_exposed_visual_design$search)
plot_exposed_visual_design <- interaction_posterior_draws_plot(draw_data_exposed_visual_design, 'participant_group')
plot_exposed_visual_design

ggsave(
      file = paste("exposed_visual_design_split_group.png", sep = ""),
      plot = plot_exposed_visual_design,
      path = "../plots/posterior_draws/num_exposed_visual_design"
    )
## Saving 7 x 5 in image

Since the credible intervals on our plot overlap, we can use mean_qi to get the numeric boundaries for the different intervals.

fit_info_exposed_visual_design <-
      draw_data_exposed_visual_design %>% group_by(search, oracle, task, participant_group) %>% mean_qi(.value, .width = c(.95, .5))

fit_info_exposed_visual_design
## # A tibble: 32 x 10
## # Groups:   search, oracle, task [8]
##    search oracle task  participant_gro… .value .lower .upper .width .point
##    <fct>  <fct>  <fct> <fct>             <dbl>  <dbl>  <dbl>  <dbl> <chr> 
##  1 bfs    compa… 3. P… professional       72.1   54.1   93.8   0.95 mean  
##  2 bfs    compa… 3. P… student            77.2   58.3  101.    0.95 mean  
##  3 bfs    compa… 4. E… professional       87.1   64.8  113.    0.95 mean  
##  4 bfs    compa… 4. E… student            93.2   69.9  121.    0.95 mean  
##  5 bfs    dziban 3. P… professional       78.8   59.0  103.    0.95 mean  
##  6 bfs    dziban 3. P… student            84.3   63.6  110.    0.95 mean  
##  7 bfs    dziban 4. E… professional       95.1   70.8  124.    0.95 mean  
##  8 bfs    dziban 4. E… student           102.    77.1  132.    0.95 mean  
##  9 dfs    compa… 3. P… professional       47.6   35.5   62.4   0.95 mean  
## 10 dfs    compa… 3. P… student            51.0   38.2   66.8   0.95 mean  
## # … with 22 more rows, and 1 more variable: .interval <chr>
write.csv(
      fit_info_exposed_visual_design,  "../plot_data/posterior_draws/num_exposed_visual_design/exposed_visual_design.csv",
      row.names = FALSE
    )

Differences between factors

We’d now like to see the difference in average accuracy between levels of search, oracle, and participant group for each task.

predictive_data_exposed_visual_design <- exposed_visual_design_data %>%
    add_fitted_draws(model_exposed_visual_design, seed = seed, re_formula = NA)

Difference in search

search_differences <- expected_diff_in_mean_plot(predictive_data_exposed_visual_design, "search", "Difference in Average Number Exposed Visual Design",  "Task", NULL)
## `summarise()` regrouping output by 'search', 'task' (override with `.groups` argument)
search_differences$plot

ggsave(
      file = "search_differences.png",
      plot = search_differences$plot,
      path = "../plots/comparisons/num_exposed_visual_design"
    )
## Saving 7 x 5 in image
write.csv(search_differences$intervals, "../plot_data/comparisons/num_exposed_visual_design/search_differences.csv", row.names = FALSE)

The above but split by datasets

search_differences_split_by_dataset <- expected_diff_in_mean_plot(predictive_data_exposed_visual_design, "search", "Difference in Average Number Exposed Visual Design",  "Task", "dataset")
## `summarise()` regrouping output by 'search', 'task', 'dataset' (override with `.groups` argument)
search_differences_split_by_dataset$plot + coord_cartesian(xlim = c(-15, 60))

ggsave(
      file = "search_differences_split_by_dataset.png",
      plot = search_differences_split_by_dataset$plot + coord_cartesian(xlim = c(-15, 60)),
      path = "../plots/comparisons/num_exposed_visual_design"
    )
## Saving 7 x 5 in image
write.csv(search_differences_split_by_dataset$intervals, "../plot_data/comparisons/num_exposed_visual_design/search_differences_split_by_dataset.csv", row.names = FALSE)

Difference in oracle

oracle_differences <- expected_diff_in_mean_plot(predictive_data_exposed_visual_design, "oracle", "Difference in Average Number Exposed Visual Design",  "Task", NULL)
## `summarise()` regrouping output by 'oracle', 'task' (override with `.groups` argument)
oracle_differences$plot

ggsave(
      file = "oracle_differences.png",
      plot = oracle_differences$plot,
      path = paste0("../plots/comparisons/num_exposed_visual_design")
    )
## Saving 7 x 5 in image
write.csv(oracle_differences$intervals, "../plot_data/comparisons/num_exposed_visual_design/oracle_differences.csv", row.names = FALSE)

The above but split by datasets

oracle_differences_split_by_dataset <- expected_diff_in_mean_plot(predictive_data_exposed_visual_design, "oracle", "Difference in Average Number Exposed Visual Design",  "Task", "dataset")
## `summarise()` regrouping output by 'oracle', 'task', 'dataset' (override with `.groups` argument)
oracle_differences_split_by_dataset$plot + coord_cartesian(xlim = c(-15, 60))

ggsave(
      file = "oracle_differences_split_by_dataset.png",
      plot = oracle_differences_split_by_dataset$plot + coord_cartesian(xlim = c(-15, 60)),
      path = "../plots/comparisons/num_exposed_visual_design"
    )
## Saving 7 x 5 in image
write.csv(oracle_differences_split_by_dataset$intervals, "../plot_data/comparisons/num_exposed_visual_design/oracle_differences_split_by_dataset.csv", row.names = FALSE)

Difference in groups

participant_group_differences <- expected_diff_in_mean_plot(predictive_data_exposed_visual_design, "participant_group", "Difference in Average Number Exposed Visual Design",  "Task", NULL)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
participant_group_differences$plot

ggsave(
      file = "group_differences.png",
      plot = participant_group_differences$plot,
      path = "../plots/comparisons/num_exposed_visual_design"
    )
## Saving 7 x 5 in image
write.csv(participant_group_differences$intervals, "../plot_data/comparisons/num_exposed_visual_design/participant_group_differences.csv", row.names = FALSE)

The above but split by datasets

group_differences_split_by_dataset <- expected_diff_in_mean_plot(predictive_data_exposed_visual_design, "participant_group", "Difference in Average Number Exposed Visual Design",  "Task", "dataset")
## `summarise()` regrouping output by 'participant_group', 'task', 'dataset' (override with `.groups` argument)
group_differences_split_by_dataset$plot

ggsave(
      file = "group_differences_split_by_dataset.png",
      plot = group_differences_split_by_dataset$plot,
      path = "../plots/comparisons/num_exposed_visual_design"
    )
## Saving 7 x 5 in image
write.csv(group_differences_split_by_dataset$intervals, "../plot_data/comparisons/num_exposed_visual_design/group_differences_split_by_dataset.csv", row.names = FALSE)

Summary Plots

Draws from the posterior, comparing the number of exposed and interacted variable sets.

draw_data_exposed_variable_set$category <- "exposed"
draw_data_interacted_var_sets$category <- "interacted"

data_conbined <- rbind(draw_data_exposed_variable_set, draw_data_interacted_var_sets)
plot_var_set <- data_conbined %>% ggplot(aes(x = oracle, y = .value, fill = category, alpha = 0.5)) +
      stat_eye(.width = c(.95, .5)) +
      theme_minimal() +
      facet_grid(task ~ search) + 
      ylab("Number of Elements") 
    
plot_var_set

ggsave(
      file = "interaction_var_sets.png",
      plot = plot_var_set,
      path = "../plots/posterior_draws"
    )
## Saving 7 x 5 in image

Draws from the posterior, comparing the number of exposed and interacted visual designs.

draw_data_exposed_visual_design$category <- "exposed"
draw_data_interacted_visual_design$category <- "interacted"

data_conbined <- rbind(draw_data_exposed_visual_design, draw_data_interacted_visual_design)
plot_vis_design <- data_conbined %>% ggplot(aes(x = oracle, y = .value, fill = category, alpha = 0.5)) +
      stat_eye(.width = c(.95, .5)) +
      theme_minimal() +
      facet_grid(task ~ search) + 
      ylab("Number of Elements") 
    
plot_vis_design

ggsave(
      file = "interaction_vis_design.png",
      plot = plot_vis_design,
      path = "../plots/posterior_draws"
    )
## Saving 7 x 5 in image

Draws from the posterior, comparing the number of exposed visual sets and variable designs.

draw_data_exposed_variable_set$type <- "Variable Sets"
draw_data_exposed_visual_design$type <- "Visual Design"

data_conbined <- rbind(draw_data_exposed_variable_set, draw_data_exposed_visual_design)
data_conbined$oracle<- gsub('compassql', 'CompassQL', data_conbined$oracle)
data_conbined$oracle<- gsub('dziban', 'Dziban', data_conbined$oracle)
data_conbined$search<- gsub('bfs', 'BFS', data_conbined$search)
data_conbined$search<- gsub('dfs', 'DFS', data_conbined$search)
data_conbined$Type<- data_conbined$type

data_conbined$condition <- paste(data_conbined$oracle, data_conbined$search, sep="\n")

plot_exposed <- data_conbined %>% ggplot(aes(x = .value, y = reorder(condition, desc(condition)), fill = type, alpha = 0.5)) +
      stat_halfeye(.width = c(.95, .5)) +
      facet_grid(. ~ task) + 
      xlab("Predicted Average Number of Elements") + ylab("Oracle/Search Combination") + scale_alpha(guide = 'none') +  scale_fill_manual(values=c("#FFA33F", "#B6EE56"))
    
plot_exposed

ggsave(
      file = "exposed.png",
      plot = plot_exposed,
      path = "../plots/posterior_draws"
    )
## Saving 7 x 5 in image
plot_exposed_search_only <- data_conbined %>% ggplot(aes(x = search, y = .value, fill = type, alpha = 0.5)) +
      stat_eye(.width = c(.95, .5)) +
      theme_minimal() +
      facet_grid(task ~ .) + 
      scale_y_continuous(breaks = seq(0, 180, by = 30), limits=c(0,180))+
      ylab("Average Number of Elements") + xlab("Oracle/Search Combination") + scale_alpha(guide = 'none')
    
plot_exposed_search_only

ggsave(
      file = "exposed_search_only.png",
      plot = plot_exposed_search_only,
      path = "../plots/posterior_draws"
    )
## Saving 7 x 5 in image

Draws from the posterior, comparing the number of interacted visual sets and variable designs.

draw_data_interacted_var_sets$type <- "Variable Sets"
draw_data_interacted_visual_design$type <- "Visual Design"

data_conbined <- rbind(draw_data_interacted_var_sets, draw_data_interacted_visual_design)

data_conbined$oracle<- gsub('compassql', 'CompassQL', data_conbined$oracle)
data_conbined$oracle<- gsub('dziban', 'Dziban', data_conbined$oracle)
data_conbined$search<- gsub('bfs', 'BFS', data_conbined$search)
data_conbined$search<- gsub('dfs', 'DFS', data_conbined$search)
data_conbined$Type<- data_conbined$type

plot_interacted <- data_conbined %>% ggplot(aes(x = .value, y = reorder(condition, desc(condition)), fill = type, alpha = 0.5)) +
      stat_halfeye(.width = c(.95, .5)) +
      facet_grid(. ~ task) + 
      xlab("Predicted Average Number of Elements") + ylab("Oracle/Search Combination") + scale_alpha(guide = 'none') +  scale_fill_manual(values=c("#FFA33F", "#B6EE56"))
    
plot_interacted

ggsave(
      file = "interacted.png",
      plot = plot_interacted,
      path = "../plots/posterior_draws"
    )
## Saving 7 x 5 in image